/***********************************************************************

                            %POLYCHOR macro
                              Version: 1.2
              
   DISCLAIMER:
     THIS INFORMATION IS PROVIDED BY SAS INSTITUTE INC. AS A SERVICE TO
     ITS USERS.  IT IS PROVIDED "AS IS".  THERE ARE NO WARRANTIES,
     EXPRESSED OR IMPLIED, AS TO MERCHANTABILITY OR FITNESS FOR A
     PARTICULAR PURPOSE REGARDING THE ACCURACY OF THE MATERIALS OR CODE
     CONTAINED HEREIN.

   title:  Compute polychoric correlations

   PURPOSE:
     The POLYCHOR macro creates a SAS data set containing a correlation
     matrix of polychoric correlations or a distance matrix based on
     polychoric correlations.

   REQUIRES:
     %POLYCHOR requires only Version 6.07 or later of base SAS Software.

   USAGE:
     The options and allowable values are:

        DATA=   SAS data set to be analyzed.  If the DATA= option is not
                supplied, the most recently created SAS data set is
                used.

        VAR=    Polychoric or tetrachoric correlations will be computed
                for every pair of variables listed in the VAR= option.
                Individual variable names, separated by blanks, must be
                specified.  By default, all numeric variables found in
                the data set will be used.  See LIMITATIONS below for 
                time considerations.

        WEIGHT= Specifies the name of an input variable to be used
                as an observation weight.  If specified, you must list
                the VAR= variables explicitly rather than relying on
                the default (_NUMERIC_), because _NUMERIC_ would also
                select the weight variable itself.

        OUT=    Specifies the name of the output data set that will 
                contain the correlation or distance matrix.  By default, 
                the output data set is named _PLCORR.

        TYPE=   Specifies the type of matrix to be created.  If 
                TYPE=CORR (the default), then a correlation matrix is 
                computed and the output data set is assigned a data set 
                type of CORR.  If TYPE=DISTANCE, then a distance matrix 
                is computed and the output data set is assigned a data 
                set type of DISTANCE.

   PRINTED OUTPUT:
     No printed output is generated by the %POLYCHOR macro.

   DETAILS:
     The PLCORR option in the FREQ procedure is used iteratively to 
     compute the polychoric correlation for each pair of variables.  If 
     both variables in a pair are binary (that is, they take on only two 
     distinct values), then the correlation computed by the PLCORR 
     option is usually referred to as the tetrachoric correlation.

     The individual correlation coefficients are then assembled into 
     either a TYPE=CORR data set containing a matrix of polychoric 
     correlations, or a TYPE=DISTANCE data set containing a matrix of 
     dissimilarity values.  The dissimilarity value used is computed as:
                      
            1 - plcorr**2
                
     where plcorr is the polychoric correlation.  

     The resulting data set can be used for descriptive analyses only in
     either the FACTOR or the CALIS procedure (specify METHOD=ULS in
     either procedure) if the correlation matrix is computed.  If the
     maximum likelihood method (METHOD=ML) is used, note that none of
     the hypothesis tests will be valid, and the polychoric correlation
     matrix may be indefinite with small samples.  The distance matrix
     can be used in the CLUSTER procedure (however, the CCC value is not
     valid) or the MDS procedure.

     See Appendix 1, "Special SAS Data Sets" in SAS/STAT User's Guide,
     Version 6, Fourth Edition for a description of TYPE=CORR and
     DISTANCE data sets. 

   MISSING VALUES:
     Observations with missing values are omitted from the computation 
     of correlations.  However, when computing the polychoric 
     correlation between two variables, if an observation's values for 
     these two variables are not missing, then the observation is used
     regardless of any missing values the observation may have on other
     variables.

   LIMITATIONS:
     LIMITED ERROR CHECKING IS DONE.  If the DATA= option is specified,
     be sure the named data set exists.  If DATA= is not specified, a
     data set must have been created previously in the current SAS
     session.  Be sure that the variables specified in the VAR= option
     exist on that data set.  Running PROC CONTENTS on the data set
     prior to using this macro is recommended for verifying the data set
     name and the names of variables.  

     The time required to compute the correlation or distance matrix 
     increases quadratically as the number of variables increases.  Up 
     to 999 variables are allowed, but the time required for more than 
     100 variables may be exorbitant.

   EXAMPLE:

       data ordinal;
          array x{5} x1-x5; 
          do n=1 to 20;
             do i=1 to 5;
                x{i}=rantbl(238423,.1,.2,.4,.2,.1);
             end;
             keep x1-x5; 
             output;
          end; 
          run;

       * Create and print a TYPE=CORR data set named _PLCORR containing
         a matrix of polychoric correlations among all variables in the
         data set ORDINAL.                                            ;

       %polychor()
       proc print; run;


       * Create and print a TYPE=DISTANCE data set named DIST containing
         a dissimilarity matrix using variables X1, X2, and X5.       ;

       %polychor(data=ordinal,var=x1 x2 x5,out=dist,type=distance)
       proc print; run;

************************************************************************/


%macro polychor(
       data=_last_,
       var=_numeric_,
       weight=,
       out=_plcorr,
       type=corr
       );

  /*--------------------------------------------------------------------
    Build a matrix of polychoric correlations (TYPE=CORR) or of
    dissimilarities 1 - plcorr**2 (TYPE=DISTANCE) by running PROC FREQ
    with the PLCORR option once for every pair of analysis variables.

    Parameters:
      data=    input data set; blank or _LAST_ means the most recently
               created data set (resolved once, up front, because the
               macro itself creates work data sets along the way).
      var=     analysis variables (default _NUMERIC_).
      weight=  optional variable passed to the PROC FREQ WEIGHT statement.
      out=     output data set name (default _PLCORR).
      type=    CORR or DISTANCE (case-insensitive).

    Side effects:
      Creates the work data set _TMP, plus _SIMPLE for TYPE=CORR.
      The NOTES/NONOTES setting in effect at entry is restored on every
      exit path, including the early error exits.

    Implementation notes:
      - Uses the macro function SYSFUNC to read the current NOTES setting.
      - Data step work variables carry a _pc_ prefix so that analysis
        variables named i, j, p, x or name cannot collide with them.
      - Variable names are stored in 32-character fields so that names
        longer than 8 characters are not truncated.
  --------------------------------------------------------------------*/

  %local _i _j i _p _notes _diag;
  %let _p=0;

  /* Remember the caller NOTES setting so it can be put back at exit */
  %let _notes=%sysfunc(getoption(notes));

  /* Normalize TYPE= once instead of upcasing at every test */
  %let type=%upcase(&type);

  /* Verify that TYPE=CORR or DISTANCE */
  %if &type ne CORR and &type ne DISTANCE %then %do;
    %put ERROR: TYPE= must be CORR or DISTANCE.;
    %goto exit;
  %end;

  /* Pin down the input data set name before any work data set is made */
  %if %length(%superq(data))=0 or %qupcase(%superq(data))=_LAST_
    %then %let data=&syslast;
  %if %qupcase(%superq(data))=_NULL_ %then %do;
    %put ERROR: No input data set is available for the POLYCHOR macro.;
    %goto exit;
  %end;

  options nonotes;

  /* Collect the analysis variable names into _v1.._vP and their count
     into _p.  The SET statement is compiled but never executed, so no
     observations are read and an empty data set works as well.  The
     ARRAY statement must precede any numeric work variable so that
     _NUMERIC_ expands to the data set variables only. */
  data _null_;
    if 0 then set &data;
    array _pc_x{*} &var;
    length _pc_name $32;
    do _pc_i=1 to dim(_pc_x);
      call vname(_pc_x{_pc_i}, _pc_name);
      call symput('_v'||trim(left(put(_pc_i,4.))), trim(_pc_name));
    end;
    call symput('_p', trim(left(put(dim(_pc_x),4.))));
    stop;
  run;

  /* _p is still 0 when the step above failed to compile or run */
  %if &_p=0 %then %do;
    %put ERROR: Unable to determine the analysis variables in %upcase(&data).;
    %goto exit;
  %end;

  /* One PROC FREQ per pair (i<j); the statistic is parked in the macro
     variable p<i>_<j>, pre-set to missing in case nothing is returned */
  %do _i=1 %to &_p;
    %do _j=%eval(&_i+1) %to &_p;
      %let p&_i._&_j=.;
      proc freq data=&data noprint;
        %if %length(&weight) %then %do;
          weight &weight;
        %end;
        tables &&_v&_i * &&_v&_j / plcorr;
        output out=_tmp plcorr;
      run;
      data _null_;
        set _tmp;
        %if &type=CORR %then %do;
          value=_plcorr_;
        %end;
        %else %do;
          value=1-_plcorr_**2;
        %end;
        /* explicit wide format keeps full precision of the estimate */
        call symput("p&_i._&_j", trim(left(put(value,best32.))));
      run;
    %end;
  %end;

  /* Diagonal entry: 1 for a correlation matrix, 0 for a distance matrix */
  %if &type=CORR %then %let _diag=1;
  %else %let _diag=0;

  data &out
    %if &type=DISTANCE %then %do;
      (type=distance)
    %end;
    ;
    %if &type=CORR %then %do;
      _type_='CORR';
      length _name_ $32;
    %end;

    /* Create matrix: one column per analysis variable */
    array _pc_x{*}
      %do i=1 %to &_p;
        &&_v&i
      %end;
      ;

    /* Row i receives columns 1..i (lower triangle plus diagonal) */
    do _pc_i=1 to dim(_pc_x);
      do _pc_j=1 to _pc_i;
        if _pc_i=_pc_j then _pc_x{_pc_j}=&_diag;
        else _pc_x{_pc_j}=
          input(symget('p'||trim(left(put(_pc_j,4.)))||'_'||
                            trim(left(put(_pc_i,4.)))), 32.);
      end;

      /* Create _NAME_ variable for CORR data sets */
      %if &type=CORR %then %do;
        _name_=symget('_v'||trim(left(put(_pc_i,4.))));
      %end;
      drop _pc_i _pc_j;
      output;
    end;
  run;

  /* Add _TYPE_=MEAN, STD and N observations to CORR data sets.
     NOTE(review): WEIGHT= is not applied to these summary statistics;
     confirm whether that is intended. */
  %if &type=CORR %then %do;
    proc summary data=&data;
      var &var;
      output out=_simple (drop=_type_ _freq_ rename=(_stat_=_type_));
    run;
    data &out (type=corr);
      set _simple (where=(_type_ in ('MEAN','STD','N'))) &out;
    run;
  %end;

  %if &syserr=0 %then %do;
    %if &type=CORR %then
      %put NOTE: Polychoric correlation matrix was output to data set %upcase(&out).;
    %else %do;
      %put NOTE: Distance matrix based on polychoric correlations was output;
      %put %str(      to data set %upcase(&out).);
    %end;
  %end;

%exit:
  /* Put back whatever NOTES setting the caller had */
  options &_notes;
%mend polychor;
/* Run the macro with every argument left at its default: most recently
   created data set, all numeric variables, TYPE=CORR, output _PLCORR. */
%polychor()

/* List the matrix that was just produced */
proc print data=_plcorr;
run;

/* Analyst notes on the choice of extraction method:
     - METHOD=ML gives faulty results (communalities > 1); METHOD=IMAGE
       works OK.
     - This agrees closely with the SPSS ML results and with EQS.
     - METHOD=IMAGE in fact gives a perfect split between positively and
       negatively worded items, and a two-factor solution.
     - The direct oblimin results are output for METHOD=IMAGE but not for
       METHOD=ML, confirming that METHOD=ML is giving faulty output. */
proc factor data=_plcorr
            method=image
            priors=smc
            rotate=oblimin
            corr;
run;